library(tidyverse)
## -- Attaching packages --------------------------------------- tidyverse 1.3.1 --
## v ggplot2 3.3.5     v purrr   0.3.4
## v tibble  3.1.5     v dplyr   1.0.7
## v tidyr   1.1.4     v stringr 1.4.0
## v readr   2.1.0     v forcats 0.5.1
## -- Conflicts ------------------------------------------ tidyverse_conflicts() --
## x dplyr::filter() masks stats::filter()
## x dplyr::lag()    masks stats::lag()
library(CodeClanData)
## 
## Attaching package: 'CodeClanData'
## The following object is masked from 'package:dplyr':
## 
##     starwars
## The following object is masked from 'package:tidyr':
## 
##     population
library(janitor)
## 
## Attaching package: 'janitor'
## The following objects are masked from 'package:stats':
## 
##     chisq.test, fisher.test
library(viridis)
## Loading required package: viridisLite
library(plotly)
## 
## Attaching package: 'plotly'
## The following object is masked from 'package:ggplot2':
## 
##     last_plot
## The following object is masked from 'package:stats':
## 
##     filter
## The following object is masked from 'package:graphics':
## 
##     layout

1 ggplot2 homework

1.1 Question 1.

Load in the libraries you need, and look at the backpack data. Take note of variable names and dimensions.

# Print the dimensions of backpack plus each column's type and first values.
glimpse(backpack)
## Rows: 100
## Columns: 9
## $ backpack_weight <int> 9, 8, 10, 6, 8, 5, 8, 4, 5, 2, 8, 21, 11, 11, 12, 11, ~
## $ body_weight     <int> 125, 195, 120, 155, 180, 240, 170, 185, 130, 120, 135,~
## $ ratio           <dbl> 0.0720000, 0.0410256, 0.0833333, 0.0387097, 0.0444444,~
## $ back_problems   <int> 1, 0, 1, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 1, 1, 0, 1, 0, ~
## $ major           <fct> Bio, Philosophy, GRC, CSC, EE, History, CM, ARCE, Bio,~
## $ year            <int> 3, 5, 4, 6, 2, 0, 3, 5, 4, 5, 3, 5, 4, 4, 3, 4, 4, 3, ~
## $ sex             <fct> Female, Male, Female, Male, Female, Male, Male, Female~
## $ status          <fct> U, U, U, G, U, G, U, U, U, U, U, U, U, U, U, U, U, U, ~
## $ units           <int> 13, 12, 14, 0, 14, 0, 15, 18, 14, 8, 15, 12, 16, 16, 1~
# Print per-column summaries: quartiles and mean for numeric columns,
# level counts for factor columns.
summary(backpack)
##  backpack_weight  body_weight        ratio         back_problems      major   
##  Min.   : 2.00   Min.   :105.0   Min.   :0.01600   Min.   :0.00   Bio    : 9  
##  1st Qu.: 8.00   1st Qu.:130.0   1st Qu.:0.05121   1st Qu.:0.00   Bus    : 8  
##  Median :11.00   Median :147.5   Median :0.07143   Median :0.00   LS     : 7  
##  Mean   :11.66   Mean   :153.1   Mean   :0.07713   Mean   :0.32   ME     : 6  
##  3rd Qu.:14.25   3rd Qu.:170.0   3rd Qu.:0.09630   3rd Qu.:1.00   CPE    : 5  
##  Max.   :35.00   Max.   :270.0   Max.   :0.18103   Max.   :1.00   AGB    : 4  
##                                                                   (Other):61  
##       year         sex     status     units      
##  Min.   :0.0   Female:55   G: 3   Min.   : 0.00  
##  1st Qu.:2.0   Male  :45   U:97   1st Qu.:13.00  
##  Median :3.0                      Median :15.00  
##  Mean   :3.2                      Mean   :14.27  
##  3rd Qu.:4.0                      3rd Qu.:16.00  
##  Max.   :6.0                      Max.   :19.00  
## 

The data has 100 rows and 9 columns. There are three factors (major, sex and status) and quantitative data such as backpack_weight and body_weight, as well as categorical data like sex.

1.2 Question 2.

What is the distribution of backpack weights that students are carrying? Choose an appropriate graph to show the distribution.

# Shared text sizing that is added to every plot in this homework.
hwk_theme <- theme(
  title = element_text(size = 14),
  text  = element_text(size = 12)
)

# Histogram of backpack weight. The question asks about the weight of the
# backpacks being carried, so the x aesthetic is backpack_weight rather than
# the student's body_weight.
backpack %>%
  clean_names() %>%
  ggplot(aes(x = backpack_weight)) +
  geom_histogram(bins = 20, col = "white", fill = "red") +
  labs(
    x        = "\nBackpack weight",
    y        = "Count",
    title    = "Backpack weight distribution of students",
    subtitle = "Weight expressed in lbs\n"
  ) +
  hwk_theme

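Backpack weights are fairly concentrated, with a median of 11, a mean of 11.66 and 1st and 3rd quartile values of 8 and 14.25, and a few outliers up to a maximum of 35. For comparison, body weights have a mean of 153.1 and 1st and 3rd quartile values of 130 and 170. These values would fall within general expectations of young students.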

1.3 Question 3.

Are there differences in backpack weight between sex?

# Column chart of backpack weight for each sex. geom_col() stacks one segment
# per student, so the height of each bar is the total for that sex.
ggplot(
  data    = clean_names(backpack),
  mapping = aes(x = sex, y = backpack_weight, colour = backpack_weight)
) +
  geom_col() +
  xlab("\nSex") +
  ylab("Backpack Weight") +
  ggtitle(
    label    = "Backpack weight versus sex for students",
    subtitle = "Weight expressed in lbs\n"
  ) +
  hwk_theme

At first glance these data are surprising: males tend to be larger and, therefore, it would be expected that they could and would carry more. In fact the bars suggest that the females carry marginally more and, taking into account their lower body weight, this would require more effort. However, geom_col() adds up every student's backpack weight, so each bar is a grand total, and there were more female respondents (55) than male (45). When the mean is calculated, below, it conflicts with the grand total expressed in the graph. The graph is, therefore, misleading.

# Average backpack weight for each sex; rows missing either value are
# excluded before averaging.
backpack %>%
  select(sex, backpack_weight) %>%
  filter(!(is.na(sex) | is.na(backpack_weight))) %>%
  group_by(sex) %>%
  summarise(backpack_weight_mean = mean(backpack_weight))

1.4 Question 4.

What is the relationship between body weight and backpack weight?

# Scatter plot of backpack weight against body weight with a smoothed trend.
# Points are used rather than a line: each row is an independent student, so
# joining them in body-weight order would imply a sequence that does not
# exist. No colour is mapped because backpack_weight is already on the y axis.
backpack %>%
  clean_names() %>%
  ggplot(aes(x = body_weight, y = backpack_weight)) +
  geom_point(alpha = 0.6) +
  geom_smooth() +
  labs(
    x        = "\nBody weight",
    y        = "Backpack Weight",
    title    = "Backpack weight versus body weight for student backpacks",
    subtitle = "Weight expressed in lbs\n"
  ) +
  hwk_theme
## `geom_smooth()` using method = 'loess' and formula 'y ~ x'

These data are within perceived expectations as, though we can’t tell whether high body weights are obesity or muscle, general expectations would be that the small and overweight would choose lower backpack weights as it would require less energy. Optimum bodyweights tend to carry larger backpacks.

1.5 Question 5.

Is this relationship different between men and women? Choose an appropriate plot and interpret your findings.

# Mean backpack weight and mean body weight per sex. The summary is computed
# once and reused for both the printed table and the chart, so the two can
# never drift apart.
sex_weight_means <- backpack %>%
  clean_names() %>%
  filter(!is.na(sex) & !is.na(backpack_weight) & !is.na(body_weight)) %>%
  group_by(sex) %>%
  summarise(
    backpack_weight_mean = mean(backpack_weight),
    body_weight_mean     = mean(body_weight)
  )
sex_weight_means

# One bar per sex, positioned on the x axis at that sex's mean body weight.
sex_weight_means %>%
  ggplot(aes(x = body_weight_mean, y = backpack_weight_mean, fill = sex)) +
  geom_col(width = 4) +
  labs(
    x        = "\nBody weight mean",
    y        = "Backpack weight mean",
    title    = "Backpack weight versus body weight means for student backpacks",
    subtitle = "Weight expressed in lbs\n"
  ) +
  hwk_theme

As above, the initial plotting was misleading, as it took into account a higher number of respondents for females and compared grand totals. Therefore this simple plot shows the mean body weight and mean weight carried for each gender, and they fall into expectations. The higher body weight of males probably accounts for a marginal increase in the mean of the backpacks carried.

Taking into account the ratio and using a scatter plot

# Scatter plot of the backpack/body weight ratio against body weight, with
# one point per student coloured by sex. Points are used instead of columns
# because columns would stack the ratios of students who share a body weight.
backpack %>%
  clean_names() %>%
  filter(!is.na(sex) & !is.na(backpack_weight) & !is.na(body_weight)) %>%
  ggplot(aes(x = body_weight, y = ratio, colour = sex)) +
  geom_point() +
  labs(
    x        = "\nBody weight ",
    y        = "Backpack weight / bodyweight ratio",
    title    = "Backpack weight versus body weight ratio for student backpacks",
    subtitle = "Weight expressed in lbs\n"
  ) +
  hwk_theme

As above, this suggests that the ratio of backpack weight to body weight is higher for females, as they have smaller body weights and carry nearly the same backpack weight as males.

1.6 Question 6.

What proportion of students report back pain?

# Number of students in each back_problems group plus that group's share of
# all students. count() is given the column directly so the result is
# ungrouped and sum(n) spans every student.
backpack_health <- backpack %>%
  clean_names() %>%
  count(back_problems) %>%
  mutate(proportion = n / sum(n))
backpack_health

# back_problems is converted to a factor for a discrete axis and fill; the y
# value stays numeric so that bar heights are proportional to the data. The
# legend is hidden because fill repeats what the x axis already shows.
backpack_health %>%
  ggplot(aes(
    x    = factor(back_problems),
    y    = proportion,
    fill = factor(back_problems)
  )) +
  geom_col(show.legend = FALSE) +
  labs(
    x        = "\nback problems",
    y        = "proportion of students",
    title    = "Proportion of students with and without back problems",
    subtitle = "0 = no back problems and 1 = back problems"
  ) +
  hwk_theme

Roughly a third of students reported back problems. This would seem a very high figure.

However, there is no definition of ‘back problems’ and, therefore, it may be that a stiff back once a term is equated to regular back pain and loss of sleep etc.

2 Question 7.

Is this proportion different between men and women?

# Number of students for every combination of back-problem status and sex.
# The 0/1 indicator is relabelled first so the axis shows readable text.
backpack_health <- backpack %>%
  clean_names() %>%
  mutate(back_problems = case_when(back_problems == 0 ~ "No problems",
                                   TRUE ~ "Back Problems")) %>%
  count(back_problems, sex)
backpack_health

# Stacked columns of student counts. n is kept numeric: wrapping it in
# factor() would turn the y axis into categories and make the stacked
# heights meaningless.
backpack_health %>%
  ggplot(aes(x = back_problems, y = n, fill = sex)) +
  geom_col(position = "stack") +
  labs(
    x        = "\nback problems",
    y        = "students",
    title    = "Proportion of students with and without back problems by gender"
  ) +
  hwk_theme

This suggests that the majority, two thirds, of individuals with reported back problems are female. This may relate to the high ratio of backpack weight to body weight, but this is speculation even though grounded on reasonable assumptions.

2.1 Question 8

Now you’ve investigated your data separately, create plots to describe the relationship between backpack weight, body weight, gender and reported backpain. Try both backpack_weight and body_weight as well as the ratio.

Hint: faceting would be useful here.

# Columns needed for the combined view, with the 0/1 back_problems indicator
# relabelled as text so it can be used as a facet title.
backpack_health <- backpack %>%
  clean_names() %>%
  select(back_problems, backpack_weight, body_weight, sex, ratio) %>%
  mutate(back_problems = case_when(back_problems == 0 ~ "No problems",
                                   TRUE ~ "Back Problems"))
backpack_health

# Body weight against backpack weight, sized by ratio and coloured by sex,
# with one panel per back-problem status. Sex is mapped to colour because
# the default point shape has no fill, so a fill mapping would not be drawn.
p <- backpack_health %>%
  ggplot(aes(
    x      = body_weight,
    y      = backpack_weight,
    size   = ratio,
    colour = sex
  )) +
  geom_point(alpha = 0.7) +
  facet_wrap(~ back_problems) +
  theme_bw()

# Convert the static plot to an interactive one with hover tooltips.
ggplotly(p)

This interactive plotly graph shows the relationships between body_weight, backpack_weight, the resultant ratio and sex.

It confirms what has been described previously but allows individual point examination by hovering over the points and we, therefore, see low body weights for females with probable low muscle mass carrying large backpack weights.